#script to generate egucems202111_2017NEI_POINT_202404.csv input file for Stu's RELPT.sub and point.sub code
#use egucems2_2017NEI_POINT_20200412 as template, reformat CEMS_NEI_match_flat and NEI_EGU_ID to the template format

#############################################################################################################################
#install libraries
#install.packages("plyr")
library(plyr)
#install.packages("dplyr")
library(dplyr)
#install.packages("readr")
library(readr)
#install.packages("psych")
library(psych)
#install.packages("ggplot2")
library(ggplot2)
#install.packages("ggpubr")
library(ggpubr)
#install.packages("geosphere")
library(geosphere)
#install.packages("data.table")
library(data.table)

#############################################################################################################################
# Set the working directory; the relative file name passed to fwrite() at the
# end of this script resolves against it.
setwd("C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Gen_RELPT_input/spreadsheets/output")

#############################################################################################################################
# Read the three prepared .csv inputs.

# CEMS <-> NEI match table (author's note: emissions in metric tons per year)
CEMS_NEI_match <- read.csv(
  file = "C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Match_CEMS_NEI_EGU/spreadsheets/output_202111/CEMS_NEI_match_flat_202111.csv"
)

# NEI stack table (author's note: all 24072 NEI stacks, emissions in short tons per year)
NEI_all <- read.csv(
  file = "C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Match_CEMS_NEI_EGU/spreadsheets/output_202111/NEI_EGU_ID.csv"
)

# Template whose column layout the output must follow (author's note: emissions in short tons per year)
EGU_template <- read.csv(
  file = "C:/Users/clyu/Desktop/GHG_CO2/Improving_inventory/V7_GRA2PES2021/Gen_RELPT_input/spreadsheets/input/egucems2_2017NEI_POINT_20200412.csv"
)

#############################################################################################################################
# Explore the match table.
# First row per distinct CEMS unit (author's count: 3918 of 4372 CEMS points matched with NEI)
CEMS_matched_with_NEI <- distinct(CEMS_NEI_match, ORIS_ID, .keep_all = TRUE)
# First row per distinct NEI point (author's count: 3742 NEI points matched with CEMS points)
NEI_matched_with_CEMS <- distinct(CEMS_NEI_match, NEI_EGU_ID, .keep_all = TRUE)

# One CEMS unit can match several NEI points, and several CEMS units can match
# the same NEI point. The CEMS CO2/SO2/NOx emissions are therefore distributed
# over the matched NEI points further below, weighted by each NEI point's share
# of the CO emissions within the group matched to that CEMS unit.

# Keep only the identifiers and the three CEMS annual emission columns ...
CEMS_NEI_matched_simp <- CEMS_NEI_match[
  ,
  c("ORIS_ID", "NEI_EGU_ID", "Annual_NOx_Emis_MetricTon", "Annual_SO2_Emis_MetricTon", "Annual_CO2_Emis_MetricTon")
]
# ... and attach the NEI attributes of each matched point (left join on NEI_EGU_ID).
CEMS_NEI_distri_emis <- join(CEMS_NEI_matched_simp, NEI_all, by = "NEI_EGU_ID", type = "left")

# Rename columns to the template's naming style.
# Lookup table: names = current column name, values = replacement name.
# Columns that are not listed (or not present) are left untouched.
colnames(CEMS_NEI_distri_emis) <- local({
  new_name_for <- c(
    COUNTRY_CD = "country_cd",
    REGION_CD = "region_cd",
    TRIBAL_CODE = "tribal_code",
    FACILITY_ID = "facility_id",
    UNIT_ID = "unit_id",
    REL_POINT_ID = "rel_point_id",
    PROCESS_ID = "process_id",
    SCC = "scc",
    FACILITY_NAME = "facility_name",
    ERPTYPE = "erptype",
    STKHGT = "stkhgt",
    STKDIAM = "stkdiam",
    STKTEMP = "stktemp",
    STKFLOW = "stkflow",
    STKVEL = "stkvel",
    NAICS = "naics",
    LON_NEI = "longitude",
    LAT_NEI = "latitude",
    FAC_SOURCE_TYPE = "fac_source_type",
    UNIT_TYPE_CODE = "unit_type_code",
    ORISPL = "oris_facility_code",
    ORISUN = "oris_boiler_id",
    ZIPCODE = "zipcode",
    Annual_NOx_Emis_MetricTon = "NOX_CEMS",
    Annual_SO2_Emis_MetricTon = "SO2_CEMS",
    Annual_CO2_Emis_MetricTon = "CO2_CEMS",
    ORIS_ID = "CEMS_ORIS_ID"
  )
  current <- colnames(CEMS_NEI_distri_emis)
  listed <- current %in% names(new_name_for)
  current[listed] <- unname(new_name_for[current[listed]])
  current
})

# Wrap every facility name in single quotes
CEMS_NEI_distri_emis$facility_name <- paste0("'", CEMS_NEI_distri_emis$facility_name, "'")

# Drop the columns named "X" and "Fuel" (if present)
keep_col <- !(names(CEMS_NEI_distri_emis) %in% c("X", "Fuel"))
CEMS_NEI_distri_emis <- CEMS_NEI_distri_emis[, keep_col]

# Convert the CEMS emissions from metric tons/year to short tons/year
metric_ton_per_short_ton <- 0.907 # 0.907 MetricTon/ShortTon
for (cems_col in c("NOX_CEMS", "SO2_CEMS", "CO2_CEMS")) {
  CEMS_NEI_distri_emis[[cems_col]] <- CEMS_NEI_distri_emis[[cems_col]] / metric_ton_per_short_ton
}
# From here on both the CEMS and the NEI emissions are in short tons/year

# Preparation for the emission distribution below:
# when one CEMS unit matches a set of several NEI points, its emissions are
# shared among those points by their relative CO emissions within the set.
# A missing CO_NEI is treated as zero for that weighting.
co_missing <- is.na(CEMS_NEI_distri_emis$CO_NEI)
CEMS_NEI_distri_emis$CO_NEI[co_missing] <- 0

##########################################################
# Distribute each CEMS unit's NOx/SO2/CO2 over its matched NEI rows.
# Within one CEMS_ORIS_ID the fraction assigned to a row is
#   1                       when the unit has exactly one matched row,
#   CO_NEI / sum(CO_NEI)    when the set's CO total is non-zero,
#   1 / (rows in the set)   when the set's CO total is zero (even split).
# The fractions are computed per group in a single pass; the previous version
# filtered the whole table and rbind()-ed a growing data frame once per ORIS_ID.
CEMS_ORIS_ID_all <- unique(CEMS_NEI_distri_emis$CEMS_ORIS_ID) # author's count: 3918
print(CEMS_ORIS_ID_all)

# Put the rows of each ORIS_ID next to each other, groups ordered by first
# appearance. order() leaves ties in their original order, so the row order
# inside a group is unchanged.
CEMS_NEI_distri_emis_p1 <- CEMS_NEI_distri_emis[
  order(match(CEMS_NEI_distri_emis$CEMS_ORIS_ID, CEMS_ORIS_ID_all)),
  ,
  drop = FALSE
]
rownames(CEMS_NEI_distri_emis_p1) <- NULL

CEMS_NEI_distri_emis_p1 <- CEMS_NEI_distri_emis_p1 %>%
  dplyr::group_by(CEMS_ORIS_ID) %>%
  dplyr::mutate(
    CO_NEI_frac = if (dplyr::n() == 1L) {
      1.0
    } else if (sum(CO_NEI) != 0) {
      CO_NEI / sum(CO_NEI)
    } else {
      # the set's CO total is zero: split evenly
      rep(1 / dplyr::n(), dplyr::n())
    },
    NOX_CEMS_toNEI = NOX_CEMS * CO_NEI_frac,
    SO2_CEMS_toNEI = SO2_CEMS * CO_NEI_frac,
    CO2_CEMS_toNEI = CO2_CEMS * CO_NEI_frac
  ) %>%
  dplyr::ungroup() %>%
  as.data.frame() # back to a plain data.frame, as the rest of the script expects

# One NEI point can be associated with several CEMS units, so the distributed
# CEMS emissions still have to be summed per unique NEI_EGU_ID.
# Done as one grouped sum; the previous version filtered the table and
# rbind()-ed a growing data frame once per NEI_EGU_ID.
NEI_EGU_ID_all <- unique(CEMS_NEI_distri_emis$NEI_EGU_ID) # author's count: 3742

CEMS_NEI_distri_emis_p2 <- CEMS_NEI_distri_emis_p1 %>%
  dplyr::group_by(NEI_EGU_ID) %>%
  dplyr::mutate(
    NOX_sumCEMS_toNEI = sum(NOX_CEMS_toNEI),
    SO2_sumCEMS_toNEI = sum(SO2_CEMS_toNEI),
    CO2_sumCEMS_toNEI = sum(CO2_CEMS_toNEI)
  ) %>%
  dplyr::ungroup() %>%
  as.data.frame()

# Drop the per-CEMS columns so that the rows belonging to one NEI point can
# collapse into a single row.
per_cems_cols <- c(
  "CEMS_ORIS_ID", "NOX_CEMS", "SO2_CEMS", "CO2_CEMS",
  "CO_NEI_frac", "NOX_CEMS_toNEI", "SO2_CEMS_toNEI", "CO2_CEMS_toNEI"
)
CEMS_NEI_distri_emis_p2 <- CEMS_NEI_distri_emis_p2[
  ,
  !names(CEMS_NEI_distri_emis_p2) %in% per_cems_cols
]

# Order the NEI points by first appearance in CEMS_NEI_distri_emis (the order
# the per-ID loop produced), then keep one copy of each distinct row.
CEMS_NEI_distri_emis_p2 <- CEMS_NEI_distri_emis_p2[
  order(match(CEMS_NEI_distri_emis_p2$NEI_EGU_ID, NEI_EGU_ID_all)),
  ,
  drop = FALSE
]
CEMS_NEI_distri_emis_p2 <- dplyr::distinct(CEMS_NEI_distri_emis_p2) # author's count: 3742
rownames(CEMS_NEI_distri_emis_p2) <- NULL

# Totals for checking that the matched CEMS emissions were fully distributed
# into the NEI points. The values are only computed here, not compared in code.

# CEMS totals over distinct ORIS_IDs, converted from metric to short tons
NOX_CEMS_total <- with(CEMS_matched_with_NEI, sum(Annual_NOx_Emis_MetricTon) / 0.907)
SO2_CEMS_total <- with(CEMS_matched_with_NEI, sum(Annual_SO2_Emis_MetricTon) / 0.907)
CO2_CEMS_total <- with(CEMS_matched_with_NEI, sum(Annual_CO2_Emis_MetricTon) / 0.907)

# Totals of the CEMS emissions after distribution to the NEI points
NOX_toNEI_total <- sum(CEMS_NEI_distri_emis_p2[["NOX_sumCEMS_toNEI"]])
SO2_toNEI_total <- sum(CEMS_NEI_distri_emis_p2[["SO2_sumCEMS_toNEI"]])
CO2_toNEI_total <- sum(CEMS_NEI_distri_emis_p2[["CO2_sumCEMS_toNEI"]])

# Totals of the NEI's own values for the same points (missing values skipped)
NOX_nativeNEI_total <- sum(CEMS_NEI_distri_emis_p2$NOX_NEI, na.rm = TRUE)
SO2_nativeNEI_total <- sum(CEMS_NEI_distri_emis_p2$SO2_NEI, na.rm = TRUE)
CO2_nativeNEI_total <- sum(CEMS_NEI_distri_emis_p2$CO2_NEI, na.rm = TRUE)

# Original author's note after inspecting these totals: "All is good!"

# Drop the NEI's own NOX/SO2/CO2 columns; the CEMS-derived sums replace them
nei_native_col <- names(CEMS_NEI_distri_emis_p2) %in% c("NOX_NEI", "SO2_NEI", "CO2_NEI")
CEMS_NEI_info <- CEMS_NEI_distri_emis_p2[, !nei_native_col]

# Rename the pollutant columns to the species names used further below
# (lookup table: names = current column name, values = replacement name)
colnames(CEMS_NEI_info) <- local({
  species_name_for <- c(
    CO_NEI = "CO",
    NH3_NEI = "NH3",
    PM10.PRI_NEI = "PM10-PRI",
    PM25.PRI_NEI = "PM25-PRI",
    VOC_NEI = "VOC",
    NOX_sumCEMS_toNEI = "NOX",
    SO2_sumCEMS_toNEI = "SO2",
    CO2_sumCEMS_toNEI = "CO2"
  )
  current <- colnames(CEMS_NEI_info)
  listed <- current %in% names(species_name_for)
  current[listed] <- unname(species_name_for[current[listed]])
  current
})

# Restore the original CO values (including their NAs): the CO column carried
# so far had its NAs replaced by zero for the weighting step.
NEI_all_CO <- NEI_all[, c("NEI_EGU_ID", "CO_NEI")]
CEMS_NEI_info <- join(CEMS_NEI_info, NEI_all_CO, by = "NEI_EGU_ID", type = "left")

# Remove the zero-filled CO column and the join key, then give the re-joined
# CO_NEI column the species name
CEMS_NEI_info <- CEMS_NEI_info[, !(names(CEMS_NEI_info) %in% c("CO", "NEI_EGU_ID"))]
colnames(CEMS_NEI_info)[colnames(CEMS_NEI_info) == "CO_NEI"] <- "CO"

#############################################################################################################################
# Turn CEMS_NEI_info into a flat (long) table: one species per row, with the
# species name in `poll` and its value in `ann_value_NEI`.
spec_list_NEI <- c('CO', 'CO2', 'NH3', 'NOX', 'PM10-PRI', 'PM25-PRI', 'SO2', 'VOC')

# Source-describing columns repeated on every species row
nei_id_cols <- c(
  "country_cd", "region_cd", "tribal_code", "facility_id", "unit_id", "rel_point_id",
  "process_id", "scc", "facility_name", "erptype", "stkhgt", "stkdiam", "stktemp",
  "stkflow", "stkvel", "naics", "longitude", "latitude", "fac_source_type",
  "unit_type_code", "oris_facility_code", "oris_boiler_id", "zipcode"
)

# Build one slice per species and stack them once. Iterating over the list
# itself keeps the number of slices tied to spec_list_NEI instead of a
# hard-coded count.
CEMS_NEI_info_spec_flat <- do.call(
  rbind,
  lapply(spec_list_NEI, function(spec_NEI) {
    one_spec <- CEMS_NEI_info[, c(nei_id_cols, spec_NEI), drop = FALSE]
    colnames(one_spec)[colnames(one_spec) == spec_NEI] <- "ann_value_NEI"
    one_spec$poll <- rep(spec_NEI, nrow(one_spec))
    one_spec
  })
)

#############################################################################################################################
# Source-describing columns only (no emission values)
CEMS_NEI_info_clean <- CEMS_NEI_info[
  ,
  c(
    "country_cd", "region_cd", "tribal_code", "facility_id", "unit_id", "rel_point_id",
    "process_id", "scc", "facility_name", "erptype", "stkhgt", "stkdiam", "stktemp",
    "stkflow", "stkvel", "naics", "longitude", "latitude", "fac_source_type",
    "unit_type_code", "oris_facility_code", "oris_boiler_id", "zipcode"
  )
]
spec_list_template <- c('CH4', 'CO', 'CO2', 'EC', 'N2O', 'NH3', 'NO3', 'NOX', 'OC', 'PM-CON', 'PM10-FIL', 'PM10-PRI', 'PM25-FIL', 'PM25-PRI', 'PMFINE', 'SF6', 'SO2', 'SO4', 'VOC')

# Repeat every row of CEMS_NEI_info_clean once per template species, with
# `poll` cycling through spec_list_template inside each source row.
# Row indexing with rep() builds the result in one step; the previous nested
# loop rbind()-ed a growing data frame for every source row and hard-coded
# the species count.
n_source_rows <- nrow(CEMS_NEI_info_clean)
n_template_spec <- length(spec_list_template)

CEMS_NEI_info_poll <- CEMS_NEI_info_clean[
  rep(seq_len(n_source_rows), each = n_template_spec),
  ,
  drop = FALSE
]
CEMS_NEI_info_poll$poll <- rep(spec_list_template, times = n_source_rows)
rownames(CEMS_NEI_info_poll) <- NULL

# Bring in the remaining template columns with a left join.
# NOTE(review): the keys include numeric columns (stkhgt, longitude, ...), so a
# template row only attaches when those values are exactly equal - confirm
# that this is intended.
template_join_keys <- c(
  "country_cd", "region_cd", "tribal_code", "facility_id", "unit_id", "rel_point_id",
  "process_id", "scc", "poll", "facility_name", "erptype", "stkhgt", "stkdiam",
  "stktemp", "stkflow", "stkvel", "naics", "longitude", "latitude", "fac_source_type",
  "unit_type_code", "oris_facility_code", "oris_boiler_id", "zipcode"
)
CEMS_NEI_info_join_template <- join(CEMS_NEI_info_poll, EGU_template, by = template_join_keys, type = "left")

# Put the columns in the same order as the template
template_col_order <- c(
  "country_cd", "region_cd", "tribal_code", "facility_id", "unit_id", "rel_point_id", "process_id",
  "agy_facility_id", "agy_unit_id", "agy_rel_point_id", "agy_process", "scc", "poll", "ann_value", "ann_pct_red",
  "facility_name", "erptype", "stkhgt", "stkdiam", "stktemp", "stkflow", "stkvel", "naics", "longitude", "latitude",
  "ll_datum", "horiz_coll_mthd", "design_capacity", "design_capacity_units", "reg_codes", "fac_source_type",
  "unit_type_code", "control_ids", "control_measures", "current_cost", "cumulative_cost", "projection_factor",
  "submitter_id", "calc_method", "data_set_id", "facil_category_code", "oris_facility_code", "oris_boiler_id",
  "ipm_yn", "calc_year", "date_updated", "fug_height", "fug_width_xdim", "fug_length_ydim", "fug_angle",
  "zipcode", "annual_avg_hours_per_year",
  "jan_value", "feb_value", "mar_value", "apr_value", "may_value", "jun_value",
  "jul_value", "aug_value", "sep_value", "oct_value", "nov_value", "dec_value",
  "jan_pctred", "feb_pctred", "mar_pctred", "apr_pctred", "may_pctred", "jun_pctred",
  "jul_pctred", "aug_pctred", "sep_pctred", "oct_pctred", "nov_pctred", "dec_pctred",
  "comment"
)
CEMS_NEI_info_join_template <- CEMS_NEI_info_join_template[, template_col_order]

# Mark the template's annual value as such; a second annual value is joined in below
names(CEMS_NEI_info_join_template)[names(CEMS_NEI_info_join_template) == "ann_value"] <- "ann_value_template"

#############################################################################################################################
# Double check the template's annual values against the values held in
# CEMS_NEI_info_spec_flat: attach ann_value_NEI to every source/species row.
double_check_keys <- c(
  "country_cd", "region_cd", "tribal_code", "facility_id", "unit_id", "rel_point_id",
  "process_id", "scc", "facility_name", "erptype", "stkhgt", "stkdiam", "stktemp",
  "stkflow", "stkvel", "naics", "longitude", "latitude", "fac_source_type",
  "unit_type_code", "oris_facility_code", "oris_boiler_id", "zipcode", "poll"
)
CEMS_NEI_info_double_check <- join(
  CEMS_NEI_info_join_template,
  CEMS_NEI_info_spec_flat,
  by = double_check_keys,
  type = "left"
)

# Difference between the two annual values, and its extremes over rows where both exist
CEMS_NEI_info_double_check$ann_value_diff <- with(
  CEMS_NEI_info_double_check,
  ann_value_NEI - ann_value_template
)
max_diff <- max(CEMS_NEI_info_double_check$ann_value_diff, na.rm = TRUE) # author's result: 9.99e-10
min_diff <- min(CEMS_NEI_info_double_check$ann_value_diff, na.rm = TRUE) # author's result: 0
# Author's conclusion: the emissions in the template file are NEI emissions.
# Author's note: the NOX, SO2 and CO2 values, although named ann_value_NEI, have
# been overwritten with CEMS EGU emissions, so there were noticeable
# differences between CEMS and NEI emissions.

# Final annual value per row:
#   ann_value_NEI       when it is available (also when both values exist),
#   ann_value_template  when only the template value is available,
#   NA                  when neither is available.
# Vectorised over the whole column; the previous version assigned into the
# data frame one row at a time and never created the column for a 0-row table.
CEMS_NEI_info_double_check$ann_value <- ifelse(
  is.na(CEMS_NEI_info_double_check[["ann_value_NEI"]]),
  CEMS_NEI_info_double_check[["ann_value_template"]],
  CEMS_NEI_info_double_check[["ann_value_NEI"]]
)

# Keep only the template's columns, in the template's order; this drops the
# helper columns (ann_value_template, ann_value_NEI, ann_value_diff).
output_col_order <- c(
  "country_cd", "region_cd", "tribal_code", "facility_id", "unit_id", "rel_point_id", "process_id",
  "agy_facility_id", "agy_unit_id", "agy_rel_point_id", "agy_process", "scc", "poll", "ann_value", "ann_pct_red",
  "facility_name", "erptype", "stkhgt", "stkdiam", "stktemp", "stkflow", "stkvel", "naics", "longitude", "latitude",
  "ll_datum", "horiz_coll_mthd", "design_capacity", "design_capacity_units", "reg_codes", "fac_source_type",
  "unit_type_code", "control_ids", "control_measures", "current_cost", "cumulative_cost", "projection_factor",
  "submitter_id", "calc_method", "data_set_id", "facil_category_code", "oris_facility_code", "oris_boiler_id",
  "ipm_yn", "calc_year", "date_updated", "fug_height", "fug_width_xdim", "fug_length_ydim", "fug_angle",
  "zipcode", "annual_avg_hours_per_year",
  "jan_value", "feb_value", "mar_value", "apr_value", "may_value", "jun_value",
  "jul_value", "aug_value", "sep_value", "oct_value", "nov_value", "dec_value",
  "jan_pctred", "feb_pctred", "mar_pctred", "apr_pctred", "may_pctred", "jun_pctred",
  "jul_pctred", "aug_pctred", "sep_pctred", "oct_pctred", "nov_pctred", "dec_pctred",
  "comment"
)
CEMS_NEI_info_out <- CEMS_NEI_info_double_check[, output_col_order]


# Distinct facility/unit/release point/process combinations before the
# missing-value filter (kept for inspection)
CEMS_NEI_info_out_uniq <- CEMS_NEI_info_out %>%
  distinct(facility_id, unit_id, rel_point_id, process_id, .keep_all = TRUE)

# Remove the rows that have no annual value.
# is.na() tests for missing values directly; the previous `ann_value != 'NA'`
# compared against the text "NA" and only dropped missing rows because that
# comparison evaluates to NA. is.na() is also TRUE for NaN, so NaN rows are
# removed as well.
CEMS_NEI_info_out <- CEMS_NEI_info_out %>% filter(!is.na(ann_value))

# Same distinct combinations after the filter (kept for inspection)
CEMS_NEI_info_out_uniq_noNA <- CEMS_NEI_info_out %>%
  distinct(facility_id, unit_id, rel_point_id, process_id, .keep_all = TRUE)

# Format the output like the template:
# region_cd as a zero-padded, 5-character code ...
CEMS_NEI_info_out$region_cd <- formatC(CEMS_NEI_info_out$region_cd, width = 5, format = "d", flag = "0")
# ... and empty strings turned into NA
CEMS_NEI_info_out[CEMS_NEI_info_out == ""] <- NA

# Re-check the CO2, SO2 and NOX totals in the final output table
CEMS_NEI_info_out_CO2 <- filter(CEMS_NEI_info_out, poll == "CO2")
CEMS_NEI_info_out_SO2 <- filter(CEMS_NEI_info_out, poll == "SO2")
CEMS_NEI_info_out_NOX <- filter(CEMS_NEI_info_out, poll == "NOX")

CO2_out_total <- sum(CEMS_NEI_info_out_CO2[["ann_value"]])
SO2_out_total <- sum(CEMS_NEI_info_out_SO2[["ann_value"]])
NOX_out_total <- sum(CEMS_NEI_info_out_NOX[["ann_value"]])

# Write the result as .csv into the working directory
fwrite(CEMS_NEI_info_out, file = "egucems202111_2017NEI_POINT_202404.csv")